# Multicollinearity and VIF Analysis for car_data_final

# Remove every visible object from the current environment before the
# analysis starts. This does not detach packages or reset options.
rm(list = ls())

suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
  library(car)
  library(corrplot)
  library(heatmaply)
  library(plotly)
  library(ggcorrplot)
})

# Restore the saved objects; the rest of the script works on `car_data_final`.
load("car_data_final.RData")

# Number of missing values in each column.
NA_counts <- colSums(is.na(car_data_final))

# Names of the columns that contain at least one missing value.
variables_with_NAs <- names(NA_counts)[NA_counts > 0]
variables_with_NAs
#> [1] "make"                          "fuel_tank_capacity_approx_gal"
# Total number of missing cells, summed over the per-column counts.
sum(NA_counts)
#> [1] 135
# ---- Fill in missing `make` values ----
# Positions of the rows whose `make` is missing. For these rows both `make`
# and `model` are rebuilt from the combined `make_model` string.
na_rows <- which(is.na(car_data_final$make))

# Copies of the incomplete rows. `NA_rows_fuelTank` is not used again in the
# visible part of the script.
NA_rows_make <- subset(car_data_final, is.na(make))
NA_rows_fuelTank <- subset(car_data_final, is.na(fuel_tank_capacity_approx_gal))

# The first space-separated token becomes the make, the remainder the model.
# NOTE(review): a make that itself contains a space would be cut after its
# first word - verify against the affected rows.
split_names <- strsplit(NA_rows_make$make_model, " ")

# vapply() always yields a character vector. sapply() returns an empty list
# when there is nothing to split, and assigning that list back would turn the
# `make` / `model` columns into list columns.
NA_rows_make$make <- vapply(split_names, `[`, character(1), 1)
NA_rows_make$model <- vapply(
  split_names,
  function(parts) paste(parts[-1], collapse = " "),
  character(1)
)

car_data_final$make[na_rows] <- NA_rows_make$make
car_data_final$model[na_rows] <- NA_rows_make$model

# ---- Fill in fuel tank capacities ----
# Values are entered by hand for fixed row positions.
# NOTE(review): positional indices silently point at other cars if the data
# are ever reordered or filtered before this step - verify.
car_data_final$fuel_tank_capacity_approx_gal[1595:1598] <- 19.0
car_data_final$fuel_tank_capacity_approx_gal[6566] <- 23.8
# Columns that are converted to factors before modelling.
factors_vars <- c(
  "parking_aid",
  "tire_pressure_monitor",
  "backup_camera",
  "drivetrain"
)

# Columns excluded from the analysis. `msrp` and `epa_class` are deliberately
# not part of this list, so they stay in the data.
remove_vars <- c(
  "make_model", "make", "model",
  "engine", "transmission", "displacement",
  "front_tire_size", "rear_tire_size",
  "year"
)

# Drop the excluded columns, then convert the categorical ones to factors.
# The two name sets do not overlap, so the order of the steps is irrelevant.
correlation_data <- car_data_final %>%
  select(-all_of(remove_vars)) %>%
  mutate(across(all_of(factors_vars), as.factor))
# Linear model of `msrp` on every other remaining column.
model <- lm(msrp ~ ., data = correlation_data)

# Numeric design matrix of all columns; factors are expanded into dummy
# columns. The explicit intercept adds a constant column, which has zero
# standard deviation, so cor() warns and returns NA for that row/column.
dummy_vars <- model.matrix(~ 1 + ., data = correlation_data)

# Pairwise correlations between the design-matrix columns.
cor_matrix <- cor(dummy_vars)
#> Warning in cor(dummy_vars): the standard deviation is zero
# Variance inflation factors for the terms of `model`.
vif_results <- car::vif(model)

#corrplot(cor_matrix, method = "color", type = "upper")

# Column labels of the design matrix.
var_names <- dimnames(dummy_vars)[[2]]

print(vif_results)
#>                                       GVIF Df GVIF^(1/(2*Df))
#> fuel_economy_estcombined_mpg    262.314898  1       16.196138
#> gas_city                        113.896378  1       10.672225
#> gas_hwy                          48.585787  1        6.970351
#> epa_class                     84209.273274 29        1.215962
#> drivetrain                        8.477889  5        1.238308
#> passenger_capacity                3.954158  1        1.988507
#> passenger_doors                  17.728704  1        4.210547
#> body_style                    78689.905522 16        1.422320
#> fuel_tank_capacity_approx_gal     4.960368  1        2.227188
#> parking_aid                       1.432587  1        1.196907
#> tire_pressure_monitor             2.516836  1        1.586454
#> backup_camera                     1.199611  1        1.095268
# Source of the corr_simple() helper below: https://towardsdatascience.com/

# Tabulate and plot the strongly correlated column pairs of a data frame.
#
# Character columns are converted to factors and factors to their integer
# codes, so every column takes part in the correlation. Each pair is reported
# once. Pairs whose correlation is exactly 1, or whose absolute correlation
# is not above `sig`, are left out.
#
# Args:
#   data: data frame to analyse. The `df` default only works if an object of
#         that name holding a data frame is visible; pass `data` explicitly.
#   sig:  threshold; only pairs with abs(correlation) > sig are kept.
#
# Returns:
#   The value of corrplot(), or NULL (invisibly) when no pair passes the
#   threshold. The table of retained pairs is printed as a side effect.
corr_simple <- function(data = df, sig = 0.5) {
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }

  # Encode non-numeric columns: character -> factor -> integer codes, so each
  # distinct value becomes a number rather than NA.
  df_cor <- data %>%
    mutate(across(where(is.character), as.factor)) %>%
    mutate(across(where(is.factor), as.numeric))

  corr <- cor(df_cor)

  # Keep each pair once: blank the lower triangle together with the diagonal.
  corr[lower.tri(corr, diag = TRUE)] <- NA
  # Drop perfect correlations. The comparison stays exact on purpose, so that
  # near-perfect (but not identical) columns are still reported.
  corr[corr == 1] <- NA

  # Long format: one row per (Var1, Var2) pair with its correlation in Freq.
  corr <- na.omit(as.data.frame(as.table(corr)))
  # Keep the pairs above the threshold, strongest first.
  corr <- subset(corr, abs(Freq) > sig)
  corr <- corr[order(-abs(corr$Freq)), ]
  print(corr)

  # Nothing to reshape or plot when no pair passes the threshold.
  if (nrow(corr) == 0) {
    message("No correlations with absolute value above ", sig, ".")
    return(invisible(NULL))
  }

  # Back to matrix form, which is what corrplot() expects.
  mtx_corr <- reshape2::acast(corr, Var1 ~ Var2, value.var = "Freq")

  corrplot(mtx_corr, is.corr = FALSE, tl.col = "black", na.label = " ")
}

# Report and plot the column pairs above the default correlation threshold.
corr_simple(data = correlation_data)
#>                             Var1                          Var2       Freq
#> 28  fuel_economy_estcombined_mpg                      gas_city  0.9864800
#> 41  fuel_economy_estcombined_mpg                       gas_hwy  0.9627708
#> 42                      gas_city                       gas_hwy  0.9138732
#> 121                      gas_hwy fuel_tank_capacity_approx_gal -0.7429677
#> 119 fuel_economy_estcombined_mpg fuel_tank_capacity_approx_gal -0.7209381
#> 120                     gas_city fuel_tank_capacity_approx_gal -0.6827299
#> 98            passenger_capacity               passenger_doors  0.5848055

# Static correlation plot built from `cor_matrix`.
corr_plot <- ggcorrplot::ggcorrplot(cor_matrix)
corr_plot

# Interactive version of the same plot.
corr_plotly <- plotly::ggplotly(corr_plot)
corr_plotly